package text;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import utils.Logger;

/**
 * parse the Chinese text
 * 
 * @author huangcd
 */
public class ChineseStem
{
    static
    {
        System.loadLibrary("ChineseStem");
        if (!init())
        {
            Logger.logException(
                    "cannot init the stem process, program will be exit", true);
        }
        // 程序退出的时候关闭CLAS中文分词系统
        Runtime.getRuntime().addShutdownHook(new Thread()
        {
            public void run()
            {
                close();
                Logger.logInfo("close stem process");
            }
        });
    }

    private static native void close();

    private static native boolean init();

    private static native String parseNative(String string);

    public synchronized static String parse(String string)
    {
        // 去除除中文、英文以外的所有字符
        string = string.replaceAll("[^\u4E00-\u9FA5|A-Z|a-z]+", " ");
        String result = parseNative(" " + string + " ").trim();
        // 中文、英文之间加空格
        Matcher mat = Pattern.compile("[\u4E00-\u9FA5][A-Z|a-z]").matcher(
                new String(result));
        while (mat.find())
        {
            String str = mat.group();
            char[] array = str.toCharArray();
            result = result.replaceAll(str, array[0] + " " + array[1]);
        }
        // 把多个连续空格替换成一个空格
        result = result.replaceAll("\\s+", " ").trim().toLowerCase();
        return result;
    }
}
